# Model section: architecture selection, checkpoint sources, encoder options,
# and knowledge-distillation (kd) settings.
model:
  arch: blip2_vicuna_instruct_ckd
  model_type: vicuna7b
  # NOTE(review): presumably these two flags select which of the checkpoint
  # paths below (`pretrained` / `finetuned`) is loaded; confirm against the
  # model loader.
  load_finetuned: false
  load_pretrained: true

  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
  # Left empty; `load_finetuned` above is false.
  finetuned: ""

  # vit encoder
  image_size: 224
  drop_path_rate: 0
  use_grad_checkpoint: false
  vit_precision: "fp16"
  freeze_vit: true

  # Q-Former
  num_query_token: 32

  # path to Vicuna checkpoint
  llm_model: "lmsys/vicuna-7b-v1.1"  # alternative noted by the author: "./llm/vicuna-7b"

  # generation configs
  prompt: ""

  # kd
  kd_loss: "ckd"
  # Mixing weight between the two loss terms:
  #   pos_loss * alpha + neg_loss * (1 - alpha)
  alpha: 0.5


# Dataset section: a single entry, `vg_ckd`, declaring train-split annotations,
# an image location, and the train-time visual/text processors.
datasets:
  vg_ckd:
    data_type: images
    build_info:
      # NOTE(review): both storage paths are relative; presumably resolved
      # against the framework's data/cache root. Confirm with the builder.
      annotations:
        train:
          storage: vg/annotations/vg_objects_hallucinated_desc2.json
      images:
        storage: "vg/images/"
    vis_processor:
      train:
        name: "blip_image_train"
        # Same value as model.image_size (224).
        # NOTE(review): presumably the two must stay in sync; confirm.
        image_size: 224
    text_processor:
      train:
        name: "blip_caption_ckd"
        max_words: 50

# Run section: task selection, optimization schedule, inference settings,
# checkpoint/output handling, and distributed-launch settings.
run:
  task: captioning
  # optimization-specific
  lr_sched: "linear_warmup_cosine_lr"
  # Learning rates carry an explicit decimal point (1.0e-5 rather than 1e-5)
  # so that YAML 1.1 loaders such as PyYAML also read them as floats instead
  # of strings. The numeric values are unchanged.
  init_lr: 1.0e-5
  min_lr: 1.0e-8
  warmup_lr: 1.0e-8
  weight_decay: 0.02
  max_epoch: 5
  warmup_steps: 1000  # assumed to be fewer steps than one epoch
  batch_size_train: 2
  batch_size_eval: 4
  num_workers: 4
  accum_grad_iters: 2

  # inference-specific
  max_len: 16
  min_len: 8
  num_beams: 5
  num_ans_candidates: 1
  inference_method: "rank"

  seed: 42
  output_dir: "VG/"

  amp: false
  resume_ckpt_path: null

  # Training-only run: `evaluate` is off and only the train split is listed;
  # the validation/test split entries below are intentionally commented out.
  evaluate: false
  train_splits: ["train"]
  # valid_splits: ["val"]
  # test_splits: ["test"]

  # distribution-specific
  device: "cuda"
  world_size: 1
  dist_url: "env://"
  distributed: true

